On August 28, 2025, the National Science Foundation announced two major advancements in America’s AI infrastructure:
The datasets were selected through a competitive process led by NSF in partnership with an interagency working group of 12 federal agencies, inviting submissions supporting AI skill development across various learning environments to help grow the nation’s AI-literate workforce.
Below, we scrape the news article to extract the names of the 10 datasets and their associated data.
# Download and parse the NSF announcement once; every scraped column below is
# read from this single document.
nsf_news_page <- rvest::read_html(
  "https://www.nsf.gov/news/nsf-expanding-national-ai-infrastructure-new-data-systems"
)

# One row per dataset listed in the announcement.
# NOTE(review): the XPath and the `ul:nth-child(14)` CSS selector are tied to
# the page's current markup, and `secondary_url` is hand-curated in list order;
# tibble() will error if the scraped columns and this vector differ in length.
datasets_tbl <- tibble::tibble(
  # Text nodes sitting directly inside each list item, with the surrounding
  # parentheses removed.
  lead_university = nsf_news_page |>
    rvest::html_elements(
      xpath = "//*[@id=\"block-nsf-theme-content\"]/article/div/div[1]/div/div/main/div[4]/div/ul[2]/li/text()"
    ) |>
    rvest::html_text2() |>
    stringr::str_remove_all("\\(|\\)"),
  # Link text of each list item's anchor.
  name = nsf_news_page |>
    rvest::html_elements("ul:nth-child(14) > li > a") |>
    rvest::html_text2(),
  # Link target of the same anchors.
  primary_url = nsf_news_page |>
    rvest::html_elements("ul:nth-child(14) > li > a") |>
    rvest::html_attr("href"),
  # Manually collected companion links, one per dataset.
  secondary_url = c(
    "https://umfieldrobotics.github.io/ai4shipwrecks/overview/",
    "https://turbulence.idies.jhu.edu/database",
    "https://registry.opendata.aws/cellpainting-gallery/",
    "https://database.fathomnet.org/fathomnet/#/about",
    "https://github.com/SunLab-GMU/PatchDB",
    "https://github.com/maryhzd/Phase-field",
    "https://www.cs.purdue.edu/news/articles/2025/purdue-researchers-build-securechain-to-strengthen-software-supply-chain-security.html",
    "https://www.synapse.org/Synapse:syn26133770",
    "https://www.library.ucsf.edu/archives/industry-documents/",
    "https://portal.opentopography.org/dataCatalog"
  )
)
# Render the scraped table as an interactive DT widget with the Buttons
# extension enabled (copy/CSV/Excel/PDF/print), ten rows per page and
# horizontal scrolling.
DT::datatable(
  datasets_tbl,
  extensions = "Buttons",
  options = list(
    pageLength = 10,
    autoWidth = TRUE,
    scrollX = TRUE,
    dom = "Bfrtip",
    buttons = c("copy", "csv", "excel", "pdf", "print")
  )
)

We curated information from these datasets into a
nsf_idss_curated_dataset_summaries.csv, which we read
here.
# Combine the scraped table with the curated summaries and fix the column
# order used for the exported CSV.
# bind_cols() pairs rows purely by position, so both tables must list the
# datasets in the same order.
# NOTE(review): `supp_df` is not created in the visible part of this document;
# presumably it is the curated CSV read in just before this step -- confirm.
full_df <- datasets_tbl |>
  dplyr::bind_cols(
    # The URL columns already exist in `datasets_tbl`; drop the curated copies
    # so the bind does not produce duplicated names.
    dplyr::select(supp_df, -c(primary_url, secondary_url))
  ) |>
  dplyr::select(
    lead_university, domain_discipline, name, description_short, data_size,
    n_records_images_samples, temporal_spatial_coverage,
    primary_url, secondary_url
  )

# Persist the combined table next to the other data files.
readr::write_csv(full_df, "../data/nsf_idss_full_dataset_summaries.csv")
# Render the combined table as an interactive DT widget with the Buttons
# extension enabled (copy/CSV/Excel/PDF/print), ten rows per page and
# horizontal scrolling.
DT::datatable(
  full_df,
  extensions = "Buttons",
  options = list(
    pageLength = 10,
    autoWidth = TRUE,
    scrollX = TRUE,
    dom = "Bfrtip",
    buttons = c("copy", "csv", "excel", "pdf", "print")
  )
)

The NSF Focus Areas
page lists the current focus areas and subareas, which we scrape below.
We will use these to classify the datasets into focus areas. We use the
classify_datasets function defined in utils.R,
which leverages the OpenAI API to perform the
classification.
# Load project helpers from utils.R.
source("utils.R")

# Fetch and parse the focus-areas page a single time; both vocabularies below
# are extracted from the same parsed document, so they always come from the
# same snapshot of the page and only one HTTP request is made.
focus_areas_url <- "https://www.nsf.gov/focus-areas"
focus_areas_page <- rvest::read_html(focus_areas_url)

# Top-level focus areas: link text inside the <h2> headings.
focus_area_nodes <- focus_areas_page |>
  rvest::html_elements("div > div > h2 > a") |>
  rvest::html_text2()

focus_areas_tbl <- tibble::tibble(
  focus_area = focus_area_nodes,
  source = focus_areas_url
)

# Subareas: link text inside the <h4> headings of list items. They are all
# assigned to the "Technology" focus area below.
technology_sub_nodes <- focus_areas_page |>
  rvest::html_elements("li > h4 > a") |>
  rvest::html_text2()

technology_subareas_tbl <- tibble::tibble(
  focus_area = "Technology",
  subarea = technology_sub_nodes,
  source = focus_areas_url
)

# One row per (focus area, subarea); focus areas without subareas keep a
# single row with `subarea` set to NA.
nsf_focus_tbl <- dplyr::left_join(
  focus_areas_tbl, technology_subareas_tbl,
  by = c("focus_area", "source")
)
# Classify every dataset against the scraped focus-area vocabulary, drop the
# `dataset_name` column from the result, save the mapping, and display it.
# NOTE(review): `classify_datasets()` is not defined in this document (the
# surrounding text says it lives in utils.R); its output columns are assumed
# from how they are used below -- confirm.
mapping_tbl <- classify_datasets(full_df, nsf_focus_tbl) |>
  dplyr::select(-dataset_name)

readr::write_csv(mapping_tbl, "../data/nsf_dataset_to_focus_area_mapping.csv")

DT::datatable(mapping_tbl)

We create three plots to understand how the datasets map to NSF focus
areas and subareas, especially within Technology. The plots utilize the
chosen_focus_area column from the mapping results, which
was generated using the OpenAI API via the
classify_datasets function in utils.R.
# Load project helpers from utils.R.
# NOTE(review): `miamired` is not defined in this document; presumably it is a
# colour constant provided by utils.R -- confirm.
source("utils.R")

# Number of datasets per chosen focus area. Bars whose label contains
# "Technology" are drawn in `miamired`, all others in grey; the factor is
# ordered by count so the bars are sorted.
focus_counts <- mapping_tbl |>
  dplyr::count(chosen_focus_area, name = "n") |>
  dplyr::arrange(n) |>
  dplyr::mutate(
    fill_col = dplyr::if_else(
      stringr::str_detect(chosen_focus_area, "Technology"),
      miamired,
      "#b0b0b0"
    ),
    chosen_focus_area = forcats::fct_reorder(chosen_focus_area, n)
  )

# Plot 1: Overview of focus areas
p1 <- ggplot2::ggplot(
  focus_counts,
  ggplot2::aes(x = n, y = chosen_focus_area)
) +
  ggplot2::geom_col(ggplot2::aes(fill = fill_col), width = 0.7) +
  # `fill_col` already holds literal colours, so use them as-is.
  ggplot2::scale_fill_identity() +
  ggplot2::geom_text(ggplot2::aes(label = n), hjust = -0.2, size = 3.8) +
  ggplot2::labs(
    title = "Where the datasets map across NSF focus areas",
    subtitle = "Vocabulary scraped from the official NSF <i>Our Focus Areas</i> page",
    x = "Number of datasets", y = NULL,
    # Caption now matches the other two figures (the stray "; ;" is removed).
    caption = "Source: nsf.gov/focus-areas (scraped); MU NSF IDSS Team"
  ) +
  # Extend the x range by 15% so the count labels are not clipped.
  ggplot2::coord_cartesian(xlim = c(0, max(focus_counts$n) * 1.15))

ggplot2::ggsave(
  "../figs/fig01_focus_areas_overview.png", p1,
  width = 12, height = 5, dpi = 300
)
# Plot 2: Technology subareas, highlighting Advanced Manufacturing gap

# Every distinct, non-missing subarea from the scraped vocabulary, sorted, so
# subareas with no datasets still get a row in the chart.
tech_levels <- sort(
  as.character(stats::na.omit(unique(nsf_focus_tbl$subarea)))
)

# Derive the subarea from labels of the form "Technology > <subarea>"; every
# other label (including missing ones) becomes "Non-Technology".
mapping_tbl_with_tech <- dplyr::mutate(
  mapping_tbl,
  chosen_technology_subarea = dplyr::if_else(
    stringr::str_starts(
      tidyr::replace_na(chosen_focus_area, ""),
      "Technology > "
    ),
    stringr::str_remove(chosen_focus_area, "^Technology > "),
    "Non-Technology"
  )
)

tech_counts_raw <- dplyr::count(
  mapping_tbl_with_tech,
  chosen_technology_subarea,
  name = "n"
)

# Start from the full subarea list so unmatched subareas appear with n = 0;
# the "Non-Technology" count has no partner here and drops out of the join.
tech_counts <- tibble::tibble(subarea = tech_levels) |>
  dplyr::left_join(
    dplyr::rename(tech_counts_raw, subarea = chosen_technology_subarea),
    by = "subarea"
  ) |>
  dplyr::mutate(
    n = tidyr::replace_na(n, 0L),
    # Move "Advanced Manufacturing" to the first factor level.
    subarea = forcats::fct_relevel(subarea, "Advanced Manufacturing", after = 0),
    col = dplyr::if_else(subarea == "Advanced Manufacturing", miamired, "#b0b0b0")
  ) |>
  dplyr::arrange(subarea)

# Lollipop chart: a grey stem from 0 to the count, a coloured dot, and the
# count printed to the right of the dot.
p2 <- ggplot2::ggplot(tech_counts, ggplot2::aes(x = n, y = subarea)) +
  ggplot2::geom_segment(
    ggplot2::aes(x = 0, xend = n, yend = subarea),
    color = "#b0b0b0"
  ) +
  ggplot2::geom_point(ggplot2::aes(color = col), size = 3.2) +
  ggplot2::scale_color_identity() +
  ggplot2::geom_text(ggplot2::aes(label = n), hjust = -0.4, size = 3.4) +
  ggplot2::labs(
    title = "Technology subareas represented by the datasets",
    subtitle = "All NSF technology subareas shown to reveal gaps; <b style='color:#c3142d;'>Advanced Manufacturing</b> highlighted",
    x = "Number of datasets", y = NULL,
    caption = "Source: nsf.gov/focus-areas (scraped); MU NSF IDSS Team"
  ) +
  ggplot2::coord_cartesian(xlim = c(0, max(tech_counts$n) * 1.3 + 0.5))

# Optional: explicit gap callout if zero
am_n <- tech_counts$n[tech_counts$subarea == "Advanced Manufacturing"]
if (length(am_n) == 1 && am_n == 0) {
  # Place the label on the Advanced Manufacturing row, using that level's
  # position as the y coordinate on the discrete axis.
  p2 <- p2 +
    ggplot2::annotate(
      "label",
      x = max(tech_counts$n) * 0.6,
      y = which(levels(tech_counts$subarea) == "Advanced Manufacturing"),
      label = "Gap: No datasets mapped to Advanced Manufacturing",
      color = "white", fill = miamired, label.size = NA, size = 3.5
    )
}

ggplot2::ggsave(
  "../figs/fig02_technology_subareas_gap.png", p2,
  width = 12, height = 5, dpi = 300
)
# Plot 3: Flow (alluvial) from Domain -> Focus -> Subarea

# One row per (domain, focus area, subarea) combination with its dataset
# count. `fill_grp` marks the flows ending in Advanced Manufacturing so they
# can be coloured differently from the rest.
flow_df <- mapping_tbl_with_tech |>
  dplyr::mutate(
    # Defensive fallback: treat a missing or empty subarea as "Non-Technology".
    sub3 = dplyr::if_else(
      is.na(chosen_technology_subarea) | chosen_technology_subarea == "",
      "Non-Technology",
      chosen_technology_subarea
    )
  ) |>
  dplyr::count(domain_discipline, chosen_focus_area, sub3, name = "n") |>
  dplyr::mutate(
    # The label must match the scraped subarea name exactly; the earlier
    # misspelling ("Advcanced") meant no flow was ever flagged as "am".
    fill_grp = dplyr::if_else(sub3 == "Advanced Manufacturing", "am", "other")
  )

p3 <- ggplot2::ggplot(
  flow_df,
  ggplot2::aes(
    axis1 = domain_discipline,
    axis2 = chosen_focus_area,
    axis3 = sub3,
    y = n
  )
) +
  ggalluvial::geom_alluvium(
    ggplot2::aes(fill = fill_grp),
    width = 0.2, alpha = 0.7
  ) +
  ggalluvial::geom_stratum(width = 0.25, fill = "#b0b0b0", color = "white") +
  # Label each stratum box with its category name.
  ggplot2::geom_text(
    ggplot2::aes(label = ggplot2::after_stat(stratum)),
    stat = ggalluvial::StatStratum,
    size = 2.55,
    fontface = "bold",
    color = "black"
  ) +
  ggplot2::scale_fill_manual(
    values = c(am = miamired, other = "lightgray"),
    guide = "none"
  ) +
  ggplot2::labs(
    title = "How domains map to NSF focus areas and technology subareas",
    subtitle = "Widths indicate dataset counts; <b style='color:#c3142d;'>Advanced Manufacturing</b> highlighted when present",
    x = NULL, y = "Datasets",
    caption = "Source: nsf.gov/focus-areas (scraped); MU NSF IDSS Team"
  ) +
  # Hide axis ticks and tick labels on both axes.
  ggplot2::theme(
    axis.ticks.x = ggplot2::element_blank(),
    axis.text.x = ggplot2::element_blank(),
    axis.ticks.y = ggplot2::element_blank(),
    axis.text.y = ggplot2::element_blank()
  )

ggplot2::ggsave(
  "../figs/fig03_domain_to_focus_flow.png", p3,
  width = 12, height = 5, dpi = 300
)
# Animation of the three plots

# Collect every saved figure whose file name starts with "fig0" and ends in
# ".png"; list.files() returns them in alphabetical order, which becomes the
# frame order.
image_files <- list.files(
  "../figs/",
  pattern = "^fig0.*\\.png$",
  full.names = TRUE
)

# Read all frames into one magick image stack.
image_list <- magick::image_read(image_files)

# Build a GIF that loops forever (loop = 0).
# NOTE(review): magick documents `delay` in 1/100ths of a second, so 750
# would hold each frame for 7.5 s -- confirm this is the intended pace.
animation <- magick::image_animate(image_list, delay = 750, loop = 0)

magick::image_write(animation, "../figs/nsf_idss_focus_areas_animation.gif")

# Print the animation so it is displayed in the rendered document.
animation